library(rethinking)
## Loading required package: rstan
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Loading required package: StanHeaders
## rstan (Version 2.18.2, GitRev: 2e1f913d3ca3)
## For execution on a local, multicore CPU with excess RAM we recommend calling
## options(mc.cores = parallel::detectCores()).
## To avoid recompilation of unchanged Stan programs, we recommend calling
## rstan_options(auto_write = TRUE)
## Loading required package: parallel
## rethinking (Version 1.59)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.1.2 ✔ purrr 0.3.2
## ✔ tidyr 0.8.3 ✔ dplyr 0.8.1
## ✔ readr 1.3.1 ✔ stringr 1.4.0
## ✔ tibble 2.1.2 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks rstan::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::map() masks rethinking::map()
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(rstan)
In this chapter, we will build our first statistical model, linear regression. In past statistics courses, we used the least-squares method to calculate the intercept and coefficient of the regression line. In QBS, we will again let samples do the estimation for us.
Following the textbook, we use the Howell1 dataset. Our goal is to build a model to describe the height data.
Create the data
# Load the Howell1 dataset bundled with the `rethinking` package
# (individual records with height, weight, and age columns).
data(Howell1)
d <- Howell1
# Keep adults only (age >= 18); the Gaussian height model below is fit
# to this subset.
d2 <- d[ d$age >= 18 , ]
Before using the LR model, we first look at the distribution of height data. It seems to follow a normal distribution.
# Histogram of adult heights: the distribution looks roughly Gaussian,
# motivating the Normal model below.
ggplot(d2) +
  geom_histogram(aes(height))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We want to use a gaussian model (normal distribution) to describe height. A normal distribution has two parameters mu and sigma. We will again use grid approximation and MCMC to estimate the 2 parameters.
We look at the result first, then explain the code later.
Grid Approximation
We estimate mu and sigma over a grid, with priors mu ~ N(178, 20) (you can change the 20 to 2 to compare the difference) and sigma ~ unif(4, 9). Code explanation:
# Build a lattice of candidate (mu, sigma) pairs for grid approximation.
grid.size <- 100
mu.list <- seq(from = 140, to = 190, length.out = grid.size)
sigma.list <- seq(from = 4, to = 9, length.out = grid.size)
# One row per (mu, sigma) combination: grid.size^2 = 10,000 points.
post <- expand.grid(mu = mu.list, sigma = sigma.list)
post
# Log-priors evaluated at every grid point.
# dnorm()/dunif() are vectorized, so we call them directly on the grid
# columns; the original looped row-by-row with apply(), which also
# coerces the data frame to a matrix. The values produced are identical.
post <- post %>%
  mutate(
    # mu ~ Normal(178, 2): the narrative uses sd = 2 (tightened from 20)
    # to show the prior's influence on the posterior.
    prior_mu = dnorm(mu, mean = 178, sd = 2, log = TRUE),
    # sigma ~ Uniform(0, 50): flat over a wide plausible range.
    prior_sigma = dunif(sigma, min = 0, max = 50, log = TRUE)
  )
post
# Log-likelihood of all observed heights under Normal(mu, sigma),
# for a single (mu, sigma) grid point.
grid_function <- function(mu, sigma) {
  sum(dnorm(d2$height, mean = mu, sd = sigma, log = TRUE))
}

post <- post %>%
  mutate(
    # mapply() pairs each mu with its sigma; the original used apply()
    # over rows, which coerces the data frame to a matrix first.
    model_likelihood = mapply(grid_function, mu, sigma),
    # NOTE(review): fixed the original's "prior_likelohood" spelling;
    # the column is only referenced within this chunk.
    prior_likelihood = prior_mu + prior_sigma
  ) %>%
  # Unnormalized log-posterior = log-likelihood + log-prior.
  mutate(post_likelihood = model_likelihood + prior_likelihood) %>%
  # Exponentiate relative to each column's max to avoid underflow;
  # these are relative (unnormalized) probabilities for plotting.
  mutate(
    post_prob = exp(post_likelihood - max(post_likelihood)),
    model_prob = exp(model_likelihood - max(model_likelihood)),
    prior_prob = exp(prior_likelihood - max(prior_likelihood))
  )
post
# Plotting: four panels over the (mu, sigma) grid — the bare grid, the
# prior surface, the likelihood surface, and the resulting posterior.
plt1 <- ggplot(post) +
  geom_point(aes(mu, sigma), size = 50 / grid.size, shape = 16) +
  ggtitle("raw grid")

plt2 <- ggplot(post) +
  geom_point(aes(mu, sigma, color = prior_prob),
             size = 50 / grid.size, shape = 16) +
  scale_colour_gradientn(colours = topo.colors(10)) +
  ggtitle("Prior")

plt3 <- ggplot(post) +
  geom_point(aes(mu, sigma, color = model_prob),
             size = 50 / grid.size, shape = 16) +
  scale_colour_gradientn(colours = topo.colors(10)) +
  ggtitle("Model likelihood")

plt4 <- ggplot(post) +
  geom_point(aes(mu, sigma, color = post_prob),
             size = 50 / grid.size, shape = 16) +
  scale_colour_gradientn(colours = topo.colors(10)) +
  ggtitle("Grid Posterior")

# Stack the four panels vertically.
grid.arrange(plt1, plt2, plt3, plt4, ncol = 1)
# Stan program for the Gaussian height model.
normal.model <- "
data {
  int N;         // number of observations
  vector[N] x;   // observed heights
}
parameters {
  real mu;
  // FIX: declare sigma with the same support as its uniform prior.
  // Without the bounds, Stan may initialize sigma outside (0, 50)
  // (or negative), where the log-density is -infinity, causing
  // rejected initializations and wasted proposals.
  real<lower=0, upper=50> sigma;
}
model {
  // priors
  mu ~ normal(178, 20);
  sigma ~ uniform(0, 50);
  // likelihood
  x ~ normal(mu, sigma);
}
"
# Fit the Gaussian model with MCMC and compare against the grid result.
normal.data <- list(N = nrow(d2), x = d2$height)
normal.fit <- stan(
  model_code = normal.model,
  data = normal.data,
  iter = 5000,
  chains = 2
)

# Keep only the two parameters of interest from the posterior draws.
normal.post <- normal.fit %>%
  as.data.frame() %>%
  select(mu, sigma)

# 2-D histogram of the MCMC draws, windowed to the same (mu, sigma)
# region as the grid so the two panels are directly comparable.
plt5 <- normal.post %>%
  ggplot(aes(mu, sigma)) +
  geom_bin2d(bins = 100) +
  scale_fill_distiller(palette = 1, direction = -1) +
  xlim(140, 190) +
  ylim(4, 9) +
  ggtitle("MCMC Posterior")

grid.arrange(plt4, plt5, ncol = 1)
## Warning: Removed 1 rows containing non-finite values (stat_bin2d).
Starting from now, we will no longer use grid approximation since our problem becomes complicated.
Let’s add weight as our predictor to build a simple linear regression model.
Priors: alpha ~ normal(178, 20); beta ~ normal(0, 10); sigma ~ unif(0, 50). We de-mean the weight data, so the actual x in the model is weight - mean(weight). Then we should check our prior:
d2$weight.c = d2$weight - mean(d2$weight)
We started with an uneducated prior: * alpha ~ normal(178, 20) * beta ~ normal(0, 10)
We can plot the regression lines suggested by the prior by executing this code.
# Prior predictive check: draw 100 (alpha, beta) pairs from the prior
# and overlay the regression line each pair implies.
prior.data <- tibble(
  alpha = rnorm(100, mean = 178, sd = 20),
  beta = rnorm(100, mean = 0, sd = 10))

p <- ggplot(data = NULL)
for (draw in seq_len(nrow(prior.data))) {
  # Line implied by this prior draw, evaluated at the centered weights.
  d2$pred_height <-
    prior.data[[draw, "alpha"]] +
    prior.data[[draw, "beta"]] * (d2$weight - mean(d2$weight))
  # `data = d2` is evaluated at call time, so each layer stores its own
  # snapshot of d2 rather than aliasing the final iteration.
  p <- p + geom_line(data = d2, aes(weight, pred_height), alpha = .3)
}
p
We can add some of our common sense into our prior. For example, we know that weight can’t be negatively related with height. So we use a lognormal prior to prevent negative coefficients. The resulting prior and suggested lines now become:
# Prior predictive check, round 2: same alpha prior, but a log-normal
# prior on beta so every sampled slope is positive.
alpha ~ normal(178, 20)beta ~ log-normal(0, 1) (same as log(beta) ~ normal(0, 1))sigma ~ unif(0, 50)prior.data2 = tibble(
alpha = rnorm(100, mean = 178, sd=20),
# NOTE(review): rlnorm()'s parameters are named meanlog/sdlog; `mean`
# and `sd` here resolve to them only via partial argument matching.
beta = rlnorm(100, mean = 0, sd = 1)) # the only thing changed
p2 = ggplot(data = NULL)
for (i in 1:100) {
# Regression line implied by prior draw i at the centered weights.
d2$pred_height =
prior.data2[[i,"alpha"]] +
prior.data2[[i,"beta"]] * (d2$weight - mean(d2$weight))
p2 = p2 +
geom_line(data=d2, aes(weight, pred_height), alpha=.3)
}
p2
There is no correct prior. Priors are information we know before seeing data. Here we know that weight should not be negatively related to height. Thus a log-normal prior is provided.
When we don’t have such information, we still usually know enough about the plausible range of values.
We can try different priors to choose a reasonable one. More importantly, when we have lots of data, model likelihood will dominate the posterior.
# Stan program for simple linear regression of height on centered weight.
LR.model <- "
data {
  int N;
  vector[N] x;   // centered weight
  vector[N] y;   // height
}
parameters {
  real alpha;
  real<lower=0> beta;   // lognormal prior => beta is positive
  // FIX: bound sigma to its uniform prior's support; without the bounds
  // Stan can initialize sigma outside (0, 50) and reject.
  real<lower=0, upper=50> sigma;
}
model {
  vector[N] mu = alpha + beta * x;
  // priors
  alpha ~ normal(178, 20);
  beta ~ lognormal(0, 1);
  sigma ~ uniform(0, 50);
  // likelihood
  y ~ normal(mu, sigma);
}
generated quantities {
  real pred_y[N];
  // FIX: the original declared mu with an initializer and then assigned
  // the same expression again; compute it once.
  vector[N] mu = alpha + beta * x;
  pred_y = normal_rng(mu, sigma);
}
"
# Assemble the data list and sample the linear-regression posterior.
LR.data <- list(
  N = nrow(d2),
  x = d2$weight.c,   # de-meaned weight
  y = d2$height
)
LR.fit <- stan(
  model_code = LR.model,
  data = LR.data,
  chains = 2,
  iter = 1000,
  cores = 2
)
# Posterior draws as one data frame (alpha, beta, sigma, mu[i], pred_y[i]).
LR.post <- as.data.frame(LR.fit)
Once you have more than a couple of parameters in a model, it is very hard to figure out from numbers alone how all of them act to influence prediction.
The author emphasizes plotting posterior distributions and posterior predictions, instead of attempting to understand a table.
The interval for mean prediction on each weight.
# Posterior draws of mu[i]: the model's mean height at each observed weight.
pred_mu = LR.post %>%
  select(contains("mu"))

# Compute each column's HPDI once; the original ran apply(.., HPDI)
# twice (once per bound), doubling the work.
mu_hpdi = apply(pred_mu, 2, HPDI)
CI = data.frame(
  mean = apply(pred_mu, 2, mean),
  L_HPDI = mu_hpdi[1, ],
  H_HPDI = mu_hpdi[2, ],
  weight = d2$weight)

CI %>%
  ggplot() +
  geom_point(data = d2, aes(weight, height), color="blue", alpha=.3) +
  geom_line(aes(weight, mean)) +
  geom_ribbon(aes(x=weight, ymin=L_HPDI, ymax=H_HPDI), alpha=.3) +
  # Title fixed: this ribbon is the HPDI of the *mean* (mu), not the
  # posterior predictive interval (that plot follows below).
  ggtitle("89% HPDI of the mean")
The interval for one prediction on each weight.
# Posterior predictive draws pred_y[i]: simulated individual heights,
# which include the Gaussian observation noise sigma.
pred_y = LR.post %>%
  select(contains("pred_y"))

# Single HPDI pass per column (the original computed it twice).
y_hpdi = apply(pred_y, 2, HPDI)
PI = data.frame(
  mean = apply(pred_y, 2, mean),
  L_HPDI = y_hpdi[1, ],
  H_HPDI = y_hpdi[2, ],
  weight = d2$weight)

PI %>%
  ggplot() +
  geom_point(data = d2, aes(weight, height), color="blue", alpha=.3) +
  geom_line(aes(weight, mean)) +
  geom_ribbon(aes(x=weight, ymin=L_HPDI, ymax=H_HPDI), alpha=.3) +
  ggtitle("89% Prediction Interval")
alpha and beta: the posterior distribution is a ranking of the relative plausibilities of every possible combination of parameter values.
alpha, beta and sigma: the distribution of height is a distribution that includes sampling variation from the process that generates Gaussian random variables.
# Overlay both uncertainty bands on the raw data: the darker ribbon is
# the HPDI of the mean (CI), the lighter one the posterior predictive
# interval (PI).
ggplot() +
  geom_point(data = d2, aes(weight, height), color = "blue", alpha = .3) +
  geom_line(data = CI, aes(weight, mean)) +
  geom_ribbon(data = CI, aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .5) +
  geom_ribbon(data = PI, aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .2)
Let’s standardize weight and use the full dataset to see if we get a better result. With all ages included, the relationship looks like a quadratic curve, so we can try to fit a polynomial regression (PLR) model.
# Full dataset (all ages): height vs standardized weight is clearly curved.
ggplot(d, aes(scale(weight), height)) +
  geom_point()
# Stan program for quadratic (polynomial) regression on standardized weight.
PLR.model <- "
data {
  int N;
  vector[N] x;   // standardized weight
  vector[N] y;   // height
}
parameters {
  real alpha;
  real<lower=0> beta1;
  real beta2; // quadratic coefficient (sign unconstrained)
  // FIX: bound sigma to its uniform prior's support; without the bounds
  // Stan can initialize sigma outside (0, 50) and reject.
  real<lower=0, upper=50> sigma;
}
model {
  vector[N] mu;
  // priors
  alpha ~ normal(178, 20);
  beta1 ~ lognormal(0, 1);
  beta2 ~ normal(0, 10); // prior on the quadratic term
  sigma ~ uniform(0, 50);
  // likelihood; .* is the elementwise product, so x .* x is x squared
  mu = alpha + beta1 * x + beta2 * (x .* x);
  y ~ normal(mu, sigma);
}
generated quantities {
  real pred_y[N];
  vector[N] mu;
  mu = alpha + beta1 * x + beta2 * (x .* x); // same mean as in model block
  pred_y = normal_rng(mu, sigma);
}
"
# Fit the polynomial model on the full dataset with standardized weight.
PLR.data <- list(
  N = nrow(d),
  x = scale(d$weight)[, 1],  # [, 1] drops the matrix wrapper scale() returns
  y = d$height
)
PLR.fit <- stan(
  model_code = PLR.model,
  data = PLR.data,
  chains = 2,
  cores = 2,
  iter = 1000
)
Check the result
print(PLR.fit, pars = c("alpha", "beta1", "beta2", "sigma"), probs=c(.25, .5, .75))
## Inference for Stan model: 1d9d19738f8d6840f4cd6604c250fa97.
## 2 chains, each with iter=1000; warmup=500; thin=1;
## post-warmup draws per chain=500, total post-warmup draws=1000.
##
## mean se_mean sd 25% 50% 75% n_eff Rhat
## alpha 146.69 0.01 0.38 146.42 146.68 146.96 679 1.00
## beta1 21.39 0.01 0.29 21.20 21.40 21.56 424 1.01
## beta2 -8.43 0.01 0.29 -8.62 -8.42 -8.22 593 1.00
## sigma 5.79 0.01 0.18 5.67 5.78 5.91 529 1.00
##
## Samples were drawn using NUTS(diag_e) at Wed Aug 7 14:58:31 2019.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
PLR.post = as.data.frame(PLR.fit)
The only thing we need to do is to change LR.post to PLR.post.
Data for CI:
# CI: posterior draws of mu (mean height at each observed weight) for
# the polynomial model.
pred_mu = PLR.post %>%
  select(contains("mu"))
# Single HPDI pass per column (the original called apply(.., HPDI) twice).
mu_hpdi = apply(pred_mu, 2, HPDI)
CI = data.frame(
  mean = apply(pred_mu, 2, mean),
  L_HPDI = mu_hpdi[1, ],
  H_HPDI = mu_hpdi[2, ],
  weight = d$weight)
Data for PI:
# PI: posterior predictive draws (simulated individual heights) for the
# polynomial model.
pred_y = PLR.post %>%
  select(contains("pred_y"))
# Single HPDI pass per column (the original called apply(.., HPDI) twice).
y_hpdi = apply(pred_y, 2, HPDI)
PI = data.frame(
  mean = apply(pred_y, 2, mean),
  L_HPDI = y_hpdi[1, ],
  H_HPDI = y_hpdi[2, ],
  weight = d$weight)
plotting the result
# Final overlay for the polynomial fit: raw data, posterior mean curve,
# HPDI of the mean (darker ribbon), and posterior predictive interval
# (lighter ribbon).
ggplot() +
  geom_point(data = d, aes(weight, height), color = "blue", alpha = .3) +
  geom_line(data = CI, aes(weight, mean)) +
  geom_ribbon(data = CI, aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .6) +
  geom_ribbon(data = PI, aes(x = weight, ymin = L_HPDI, ymax = H_HPDI), alpha = .2)